Load Packages
library(arrow)
library(tidyverse)
library(ggplot2)
library(ggbeeswarm)
library(dplyr)
Upload Parquet File:
d1 = read_parquet("C:/Users/asare/Downloads/prevalence_by_geography_and_year_and_source (2).parquet")
head(d1)
Add Regions Column:
d1 = d1 %>%
mutate(region = case_when(
geography %in% c("Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
"Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota", "Wisconsin") ~ "Midwest",
geography %in% c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "New Jersey",
"New York", "Pennsylvania", "Rhode Island", "Vermont") ~ "Northeast",
geography %in% c("Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho",
"Montana", "Nevada", "New Mexico", "Oregon", "Utah", "Washington", "Wyoming") ~ "West",
geography %in% c("Alabama", "Arkansas", "Delaware", "District of Columbia", "Florida", "Georgia",
"Kentucky", "Louisiana", "Maryland", "Mississippi", "North Carolina", "Oklahoma",
"South Carolina", "Tennessee", "Texas", "Virginia", "West Virginia") ~ "South",
TRUE ~ "Other"
))
#Separate Datasets by United States only and States Only
state = d1 %>% filter(! region %in% c("Other"))
us = d1 %>% filter(region == "Other")
Create Separate Region Only Dataset
Midwest = c("Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
"Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota", "Wisconsin")
Northeast = c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "New Jersey",
"New York", "Pennsylvania", "Rhode Island", "Vermont")
West = c("Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho",
"Montana", "Nevada", "New Mexico", "Oregon", "Utah", "Washington", "Wyoming")
South = c("Alabama", "Arkansas", "Delaware", "District of Columbia", "Florida", "Georgia",
"Kentucky", "Louisiana", "Maryland", "Mississippi", "North Carolina", "Oklahoma",
"South Carolina", "Tennessee", "Texas", "Virginia", "West Virginia")
Midwest <- c("Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
"Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota", "Wisconsin")
Northeast <- c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "New Jersey",
"New York", "Pennsylvania", "Rhode Island", "Vermont")
West <- c("Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho",
"Montana", "Nevada", "New Mexico", "Oregon", "Utah", "Washington", "Wyoming")
South <- c("Alabama", "Arkansas", "Delaware", "District of Columbia", "Florida", "Georgia",
"Kentucky", "Louisiana", "Maryland", "Mississippi", "North Carolina", "Oklahoma",
"South Carolina", "Tennessee", "Texas", "Virginia", "West Virginia")
region_df <- data.frame(
state = c(Midwest, Northeast, West, South),
region = c(rep("Midwest", length(Midwest)),
rep("Northeast", length(Northeast)),
rep("West", length(West)),
rep("South", length(South)))
)
Create Map
library(geofacet)
#install.packages("choroplethr")
#library(choroplethr)
#install.packages("plotly")
library(plotly)
state_choropleth(region_df, geoid.name = 'state', value.name = 'region')
ggsave("color_coded_map_region.png", width = 10, height = 10)

Look at Total Regional Prevalence for Obesity and Diabetes
fig1 = ggplot(state %>% filter(year == "2024", age == "Total", outcome_name == "Diabetes"), aes(x = region, y = value, color = geography, group = region)) +
geom_quasirandom(dodge.width = 0.8, size = 2.5, alpha = 0.8) +
facet_wrap(~source) +
labs(
title = "Total 2024 Diabetes Prevalence by Region",
subtitle = "CDC BRFSS, Epic Cosmos: HbA1c, Epic Cosmos: ICD10 Data",
x = "Region",
y = "Prevalence (%)",
color = "Region:"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
plot.subtitle = element_text(size = 14, hjust = 0.5), panel.spacing = unit(3.5, "lines"), strip.text = element_text(size = 15, face = "bold"), legend.position = "none"
)
fig1 = ggplotly(fig1, tooltip = c("geography", "value"))
fig1
# Save ggplotly as widget in file test.html
saveWidget(ggplotly(fig1), file = "diab_int.html")
test_df = state %>% filter(year == "2024", age == "Total", outcome_name == "Obesity", region == "South") %>% select(geography, region, value, source)
write.csv(test_df , "mydata.csv", row.names = FALSE)
ggplot(state %>%
filter(year != "2025",
age == "Total",
outcome_name == "Diabetes",
source != "Medicare FFS"),
aes(x = year, y = value, color = geography, group = geography)) +
geom_line() +
facet_wrap(~ region) +
labs(
title = "Total 2024 Diabetes Prevalence by Region",
subtitle = "CDC BRFSS, Epic Cosmos: HbA1c, Epic Cosmos: ICD10 Data",
x = "Year",
y = "Prevalence (%)",
color = "State"
) +
theme_minimal(base_size = 14) +
theme(
axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
plot.subtitle = element_text(size = 16, hjust = 0.5),
panel.spacing = unit(3.5, "lines"),
strip.text = element_text(size = 16, face = "bold")
)

---
title: "SubStack Blog Post"
output: html_notebook
---

Load Packages

```{r}

library(arrow)
library(tidyverse)
library(ggplot2)
library(ggbeeswarm)
library(dplyr)


```

Upload Parquet File:

```{r}
d1 = read_parquet("C:/Users/asare/Downloads/prevalence_by_geography_and_year_and_source (2).parquet")

head(d1)
```


Add Regions Column:

```{r}
d1 = d1 %>%
  mutate(region = case_when(
    geography %in% c("Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota", 
                 "Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota", "Wisconsin") ~ "Midwest",
    geography %in% c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "New Jersey", 
                 "New York", "Pennsylvania", "Rhode Island", "Vermont") ~ "Northeast",
    geography %in% c("Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho", 
                 "Montana", "Nevada", "New Mexico", "Oregon", "Utah", "Washington", "Wyoming") ~ "West",
    geography %in% c("Alabama", "Arkansas", "Delaware", "District of Columbia", "Florida", "Georgia", 
                 "Kentucky", "Louisiana", "Maryland", "Mississippi", "North Carolina", "Oklahoma", 
                 "South Carolina", "Tennessee", "Texas", "Virginia", "West Virginia") ~ "South",
    TRUE ~ "Other"
  ))


#Separate Datasets by United States only and States Only

state = d1 %>% filter(! region %in% c("Other"))
us = d1 %>% filter(region == "Other")
```


Create Separate Region Only Dataset
```{r}
Midwest = c("Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota", 
                 "Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota", "Wisconsin") 
Northeast = c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "New Jersey", 
                 "New York", "Pennsylvania", "Rhode Island", "Vermont") 
West = c("Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho", 
                 "Montana", "Nevada", "New Mexico", "Oregon", "Utah", "Washington", "Wyoming")
South = c("Alabama", "Arkansas", "Delaware", "District of Columbia", "Florida", "Georgia", 
                 "Kentucky", "Louisiana", "Maryland", "Mississippi", "North Carolina", "Oklahoma", 
                 "South Carolina", "Tennessee", "Texas", "Virginia", "West Virginia")

Midwest <- c("Illinois", "Indiana", "Iowa", "Kansas", "Michigan", "Minnesota",
             "Missouri", "Nebraska", "North Dakota", "Ohio", "South Dakota", "Wisconsin")

Northeast <- c("Connecticut", "Maine", "Massachusetts", "New Hampshire", "New Jersey",
               "New York", "Pennsylvania", "Rhode Island", "Vermont")

West <- c("Alaska", "Arizona", "California", "Colorado", "Hawaii", "Idaho",
          "Montana", "Nevada", "New Mexico", "Oregon", "Utah", "Washington", "Wyoming")

South <- c("Alabama", "Arkansas", "Delaware", "District of Columbia", "Florida", "Georgia",
           "Kentucky", "Louisiana", "Maryland", "Mississippi", "North Carolina", "Oklahoma",
           "South Carolina", "Tennessee", "Texas", "Virginia", "West Virginia")

region_df <- data.frame(
  state = c(Midwest, Northeast, West, South),
  region = c(rep("Midwest", length(Midwest)),
             rep("Northeast", length(Northeast)),
             rep("West", length(West)),
             rep("South", length(South))))
```

Create Map
```{r}
library(geofacet)
```
```{r}
#install.packages("choroplethr")
#library(choroplethr)
#install.packages("plotly")
library(plotly)

```

```{r}
state_choropleth(region_df, geoid.name = 'state', value.name = 'region')

ggsave("color_coded_map_region.png", width = 10, height = 10)
```
Look at Total Regional Prevalence for Obesity and Diabetes

```{r}

ggplot(state %>% filter(year == "2024", age == "Total", outcome_name == "Diabetes"), aes(x  = region, y = value, color = region)) +  
  geom_quasirandom(dodge.width = 0.8, size = 2.5, alpha = 0.8)  +
  facet_wrap(~source) +
  labs(
    title = "Total 2024 Diabetes Prevalence by Region",
    subtitle = "CDC BRFSS, Epic Cosmos: HbA1c, Epic Cosmos: ICD10 Data",
    x = "Region",
    y = "Prevalence (%)",
    color = "Region:"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5), panel.spacing = unit(3.5, "lines"), strip.text = element_text(size = 15, face = "bold")
  )


ggsave("total_rp_bee.png", width = 10, height = 8)

#Diabetes Interactive Plot


fig1 = ggplot(state %>% filter(year == "2024", age == "Total", outcome_name == "Diabetes"), aes(x  = region, y = value, color = geography, group = region)) +  
  geom_quasirandom(dodge.width = 0.8, size = 2.5, alpha = 0.8)  +
  facet_wrap(~source) +
  labs(
    title = "Total 2024 Diabetes Prevalence by Region",
    subtitle = "CDC BRFSS, Epic Cosmos: HbA1c, Epic Cosmos: ICD10 Data",
    x = "Region",
    y = "Prevalence (%)",
    color = "Region:"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 14, hjust = 0.5), panel.spacing = unit(3.5, "lines"), strip.text = element_text(size = 15, face = "bold"), legend.position = "none"
  )


fig1 = ggplotly(fig1, tooltip = c("geography", "value"))
fig1
# Save ggplotly as widget in file test.html
saveWidget(ggplotly(fig1), file = "diab_int.html")

#Obesity


ggplot(state %>% filter(year == "2024", age == "Total", outcome_name == "Obesity"), aes(x  = region, y = value, color = region)) +  
  geom_quasirandom(dodge.width = 0.8, size = 2.5, alpha = 0.8)  +
  facet_wrap(~source) +
  labs(
    title = "Total 2024 Obesity Prevalence by Region",
    subtitle = "CDC BRFSS, Epic Cosmos: BMI, Epic Cosmos: ICD10 Data",
    x = "Region",
    y = "Prevalence (%)",
    fill = "Data Source", color = "Region:"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 16, hjust = 0.5), panel.spacing = unit(3.5, "lines"), strip.text = element_text(size = 15, face = "bold")
  )

ggsave("obs_total_rp_bee.png", width = 10, height = 8)


#Obesity Interactive Plot


fig2 = ggplot(state %>% filter(year == "2024", age == "Total", outcome_name == "Obesity"), aes(x  = region, y = value, color = geography, group = region)) +  
  geom_quasirandom(dodge.width = 0.8, size = 2.5, alpha = 0.8)  +
  facet_wrap(~source) +
  labs(
    title = "Total 2024 Obesity Prevalence by Region",
    subtitle = "CDC BRFSS, Epic Cosmos: BMI, Epic Cosmos: ICD10 Data",
    x = "Region",
    y = "Prevalence (%)",
    fill = "Data Source", color = "Region:"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 16, hjust = 0.5), panel.spacing = unit(3.5, "lines"), strip.text = element_text(size = 15, face = "bold"), legend.position = "none"
  )

fig2 = ggplotly(fig2, tooltip = c("geography", "value"))
fig2
# Save ggplotly as widget in file test.html
saveWidget(ggplotly(fig2), file = "obs_int.html")

#Create account with plotly -- add user name and api key to rstudio -- api_create(name of plot, name of desired file) --> publish graph to online plotly account --> use url as iframe in substack to embed html in blog post

# Install necessary packages if you haven't already
# install.packages("plotly")
# install.packages("htmlwidgets")

library(plotly)
library(htmlwidgets)
library(ggplot2) # Optional, if you're converting a ggplot

# Example: Create a plotly graph
p <- plot_ly(data = diamonds, x = ~carat, y = ~price, text = ~paste("Clarity: ", clarity),
             mode = "markers", color = ~carat, size = ~carat)

# Save the plot as a standalone HTML file
saveWidget(as_widget(p), "plotly_graph.html", selfcontained = TRUE)


```

```{r}

#Obesity
test_df = state %>% filter(year == "2024", age == "Total", outcome_name == "Obesity") %>% select(geography, region, value, source)

wide_df <- test_df |>
  pivot_wider(
    names_from = geography,
    values_from = value
  )


write.csv(wide_df , "mydata3.csv", row.names = FALSE)

#Diabetes


diab_df = state %>% filter(year == "2024", age == "Total", outcome_name == "Diabetes") %>% select(geography, region, value, source)

wide_diab <- diab_df |>
  pivot_wider(
    names_from = source,
    values_from = value
  )


write.csv(wide_diab , "mydata4.csv", row.names = FALSE)

```

```{r}
ggplot(state %>%
         filter(year != "2025",
                age == "Total",
                outcome_name == "Diabetes",
                source != "Medicare FFS"),
       aes(x = year, y = value, color = geography, group = geography)) +
  geom_line() +
  facet_wrap(~ region) +
  labs(
    title = "Total 2024 Diabetes Prevalence by Region",
    subtitle = "CDC BRFSS, Epic Cosmos: HbA1c, Epic Cosmos: ICD10 Data",
    x = "Year",
    y = "Prevalence (%)",
    color = "State"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.x = element_text(size = 16, angle = 45, hjust = 1),
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(size = 16, hjust = 0.5),
    panel.spacing = unit(3.5, "lines"),
    strip.text = element_text(size = 16, face = "bold")
  )
```

